In [1]:
import numpy as np #linear algebra
import pandas as pd # data processing,CSV file I/O(e.g pd.read_csv)
import seaborn as sns # for statistical data visualization
import plotly.express as px
import matplotlib.pyplot as plt # for data visualization
%matplotlib inline
from warnings import filterwarnings 
filterwarnings("ignore")
In [2]:
data = pd.read_csv(r"C:\Users\laxma\Downloads\dataset.csv")
data
Out[2]:
Year Month Sector Hydroelectric Power Geothermal Energy Solar Energy Wind Energy Wood Energy Waste Energy Fuel Ethanol, Excluding Denaturant Biomass Losses and Co-products Biomass Energy Total Renewable Energy Renewable Diesel Fuel Other Biofuels Conventional Hydroelectric Power Biodiesel
0 1973 1 Commerical 0.000 0.000 0.000 0.000 0.570 0.000 0.000 0.000 0.570 0.570 0.00 0.000 0.000 0.000
1 1973 1 Electric Power 0.000 0.490 0.000 0.000 0.054 0.157 0.000 0.000 0.211 89.223 0.00 0.000 88.522 0.000
2 1973 1 Industrial 1.040 0.000 0.000 0.000 98.933 0.000 0.000 0.000 98.933 99.973 0.00 0.000 0.000 0.000
3 1973 1 Residential 0.000 0.000 0.000 0.000 30.074 0.000 0.000 0.000 0.000 30.074 0.00 0.000 0.000 0.000
4 1973 1 Transportation 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00 0.000 0.000 0.000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3060 2024 1 Commerical 0.073 1.669 4.267 0.036 7.053 6.233 2.441 0.000 15.728 21.773 0.00 0.000 0.000 0.000
3061 2024 1 Electric Power 0.000 4.667 32.707 119.265 15.071 13.873 0.000 0.000 28.944 257.661 0.00 0.000 72.078 0.000
3062 2024 1 Industrial 0.308 0.356 0.987 0.035 104.878 14.171 1.533 67.742 188.325 190.011 0.00 0.000 0.000 0.000
3063 2024 1 Residential 0.000 3.354 14.897 0.000 34.065 0.000 0.000 0.000 0.000 52.316 0.00 0.000 0.000 0.000
3064 2024 1 Transportation 0.000 0.000 0.000 0.000 0.000 0.000 86.098 0.000 140.188 0.000 30.78 3.442 0.000 19.867

3065 rows × 17 columns

In [3]:
data.head()
Out[3]:
Year Month Sector Hydroelectric Power Geothermal Energy Solar Energy Wind Energy Wood Energy Waste Energy Fuel Ethanol, Excluding Denaturant Biomass Losses and Co-products Biomass Energy Total Renewable Energy Renewable Diesel Fuel Other Biofuels Conventional Hydroelectric Power Biodiesel
0 1973 1 Commerical 0.00 0.00 0.0 0.0 0.570 0.000 0.0 0.0 0.570 0.570 0.0 0.0 0.000 0.0
1 1973 1 Electric Power 0.00 0.49 0.0 0.0 0.054 0.157 0.0 0.0 0.211 89.223 0.0 0.0 88.522 0.0
2 1973 1 Industrial 1.04 0.00 0.0 0.0 98.933 0.000 0.0 0.0 98.933 99.973 0.0 0.0 0.000 0.0
3 1973 1 Residential 0.00 0.00 0.0 0.0 30.074 0.000 0.0 0.0 0.000 30.074 0.0 0.0 0.000 0.0
4 1973 1 Transportation 0.00 0.00 0.0 0.0 0.000 0.000 0.0 0.0 0.000 0.000 0.0 0.0 0.000 0.0
In [4]:
data.tail()
Out[4]:
Year Month Sector Hydroelectric Power Geothermal Energy Solar Energy Wind Energy Wood Energy Waste Energy Fuel Ethanol, Excluding Denaturant Biomass Losses and Co-products Biomass Energy Total Renewable Energy Renewable Diesel Fuel Other Biofuels Conventional Hydroelectric Power Biodiesel
3060 2024 1 Commerical 0.073 1.669 4.267 0.036 7.053 6.233 2.441 0.000 15.728 21.773 0.00 0.000 0.000 0.000
3061 2024 1 Electric Power 0.000 4.667 32.707 119.265 15.071 13.873 0.000 0.000 28.944 257.661 0.00 0.000 72.078 0.000
3062 2024 1 Industrial 0.308 0.356 0.987 0.035 104.878 14.171 1.533 67.742 188.325 190.011 0.00 0.000 0.000 0.000
3063 2024 1 Residential 0.000 3.354 14.897 0.000 34.065 0.000 0.000 0.000 0.000 52.316 0.00 0.000 0.000 0.000
3064 2024 1 Transportation 0.000 0.000 0.000 0.000 0.000 0.000 86.098 0.000 140.188 0.000 30.78 3.442 0.000 19.867
In [5]:
data.shape
Out[5]:
(3065, 17)
In [6]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3065 entries, 0 to 3064
Data columns (total 17 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Year                                3065 non-null   int64  
 1   Month                               3065 non-null   int64  
 2   Sector                              3065 non-null   object 
 3   Hydroelectric Power                 3065 non-null   float64
 4   Geothermal Energy                   3065 non-null   float64
 5   Solar Energy                        3065 non-null   float64
 6   Wind Energy                         3065 non-null   float64
 7   Wood Energy                         3065 non-null   float64
 8   Waste Energy                        3065 non-null   float64
 9   Fuel Ethanol, Excluding Denaturant  3065 non-null   float64
 10  Biomass Losses and Co-products      3065 non-null   float64
 11  Biomass Energy                      3065 non-null   float64
 12  Total Renewable Energy              3065 non-null   float64
 13  Renewable Diesel Fuel               3065 non-null   float64
 14  Other Biofuels                      3065 non-null   float64
 15  Conventional Hydroelectric Power    3065 non-null   float64
 16  Biodiesel                           3065 non-null   float64
dtypes: float64(14), int64(2), object(1)
memory usage: 407.2+ KB
In [7]:
data.isnull().sum()
Out[7]:
Year                                  0
Month                                 0
Sector                                0
Hydroelectric Power                   0
Geothermal Energy                     0
Solar Energy                          0
Wind Energy                           0
Wood Energy                           0
Waste Energy                          0
Fuel Ethanol, Excluding Denaturant    0
Biomass Losses and Co-products        0
Biomass Energy                        0
Total Renewable Energy                0
Renewable Diesel Fuel                 0
Other Biofuels                        0
Conventional Hydroelectric Power      0
Biodiesel                             0
dtype: int64
In [8]:
data_filled = data.fillna(0)
print(data_filled.isnull().sum())                          
Year                                  0
Month                                 0
Sector                                0
Hydroelectric Power                   0
Geothermal Energy                     0
Solar Energy                          0
Wind Energy                           0
Wood Energy                           0
Waste Energy                          0
Fuel Ethanol, Excluding Denaturant    0
Biomass Losses and Co-products        0
Biomass Energy                        0
Total Renewable Energy                0
Renewable Diesel Fuel                 0
Other Biofuels                        0
Conventional Hydroelectric Power      0
Biodiesel                             0
dtype: int64
In [9]:
data.duplicated().sum()
Out[9]:
0
In [10]:
data['Year'].unique()
Out[10]:
array([1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983,
       1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994,
       1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
       2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
       2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024], dtype=int64)
In [11]:
data['Month'].unique()
Out[11]:
array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12], dtype=int64)
In [12]:
len(data['Month'].unique())
Out[12]:
12
In [13]:
data['Sector'].unique()
Out[13]:
array(['Commerical', 'Electric Power', 'Industrial', 'Residential',
       'Transportation'], dtype=object)
In [14]:
len(data['Sector'].unique())
Out[14]:
5
In [15]:
data.drop(['Month', 'Year'],axis=1, inplace=True)
In [16]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3065 entries, 0 to 3064
Data columns (total 15 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Sector                              3065 non-null   object 
 1   Hydroelectric Power                 3065 non-null   float64
 2   Geothermal Energy                   3065 non-null   float64
 3   Solar Energy                        3065 non-null   float64
 4   Wind Energy                         3065 non-null   float64
 5   Wood Energy                         3065 non-null   float64
 6   Waste Energy                        3065 non-null   float64
 7   Fuel Ethanol, Excluding Denaturant  3065 non-null   float64
 8   Biomass Losses and Co-products      3065 non-null   float64
 9   Biomass Energy                      3065 non-null   float64
 10  Total Renewable Energy              3065 non-null   float64
 11  Renewable Diesel Fuel               3065 non-null   float64
 12  Other Biofuels                      3065 non-null   float64
 13  Conventional Hydroelectric Power    3065 non-null   float64
 14  Biodiesel                           3065 non-null   float64
dtypes: float64(14), object(1)
memory usage: 359.3+ KB
In [17]:
data.head()
Out[17]:
Sector Hydroelectric Power Geothermal Energy Solar Energy Wind Energy Wood Energy Waste Energy Fuel Ethanol, Excluding Denaturant Biomass Losses and Co-products Biomass Energy Total Renewable Energy Renewable Diesel Fuel Other Biofuels Conventional Hydroelectric Power Biodiesel
0 Commerical 0.00 0.00 0.0 0.0 0.570 0.000 0.0 0.0 0.570 0.570 0.0 0.0 0.000 0.0
1 Electric Power 0.00 0.49 0.0 0.0 0.054 0.157 0.0 0.0 0.211 89.223 0.0 0.0 88.522 0.0
2 Industrial 1.04 0.00 0.0 0.0 98.933 0.000 0.0 0.0 98.933 99.973 0.0 0.0 0.000 0.0
3 Residential 0.00 0.00 0.0 0.0 30.074 0.000 0.0 0.0 0.000 30.074 0.0 0.0 0.000 0.0
4 Transportation 0.00 0.00 0.0 0.0 0.000 0.000 0.0 0.0 0.000 0.000 0.0 0.0 0.000 0.0
In [18]:
data.columns
Out[18]:
Index(['Sector', 'Hydroelectric Power', 'Geothermal Energy', 'Solar Energy',
       'Wind Energy', 'Wood Energy', 'Waste Energy',
       'Fuel Ethanol, Excluding Denaturant', 'Biomass Losses and Co-products',
       'Biomass Energy', 'Total Renewable Energy', 'Renewable Diesel Fuel',
       'Other Biofuels', 'Conventional Hydroelectric Power', 'Biodiesel'],
      dtype='object')
In [19]:
#VISUALIZATION
In [20]:
fig=px.violin(data,x='Waste Energy',y='Geothermal Energy',color='Waste Energy')
fig.show()
In [21]:
plt.scatter(data['Geothermal Energy'],data['Solar Energy'],color='red')
plt.xticks(rotation=90)
plt.show()
In [22]:
sns.lineplot(x='Wind Energy', y='Wood Energy', data=data)
Out[22]:
<AxesSubplot:xlabel='Wind Energy', ylabel='Wood Energy'>
In [23]:
plt.figure(figsize=(8, 4))
sns.scatterplot(data=data, x='Biomass Energy', y='Solar Energy')
plt.xlabel('Biomass Energy')
plt.ylabel('Solar Energy')
plt.show()
In [24]:
sns.displot(data["Renewable Diesel Fuel"])
Out[24]:
<seaborn.axisgrid.FacetGrid at 0x2edc670b670>
In [25]:
sns.relplot(x='Total Renewable Energy',y='Biomass Losses and Co-products',data=data)
Out[25]:
<seaborn.axisgrid.FacetGrid at 0x2edc6266310>
In [26]:
sns.countplot(x='Sector',data=data)
Out[26]:
<AxesSubplot:xlabel='Sector', ylabel='count'>
In [27]:
#MODEL BUILDING
In [28]:
X = data

y = data['Sector']
In [29]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

X['Sector'] = le.fit_transform(X['Sector'])

y = le.transform(y)
In [30]:
X.head()
Out[30]:
Sector Hydroelectric Power Geothermal Energy Solar Energy Wind Energy Wood Energy Waste Energy Fuel Ethanol, Excluding Denaturant Biomass Losses and Co-products Biomass Energy Total Renewable Energy Renewable Diesel Fuel Other Biofuels Conventional Hydroelectric Power Biodiesel
0 0 0.00 0.00 0.0 0.0 0.570 0.000 0.0 0.0 0.570 0.570 0.0 0.0 0.000 0.0
1 1 0.00 0.49 0.0 0.0 0.054 0.157 0.0 0.0 0.211 89.223 0.0 0.0 88.522 0.0
2 2 1.04 0.00 0.0 0.0 98.933 0.000 0.0 0.0 98.933 99.973 0.0 0.0 0.000 0.0
3 3 0.00 0.00 0.0 0.0 30.074 0.000 0.0 0.0 0.000 30.074 0.0 0.0 0.000 0.0
4 4 0.00 0.00 0.0 0.0 0.000 0.000 0.0 0.0 0.000 0.000 0.0 0.0 0.000 0.0
In [31]:
cols = X.columns
In [32]:
from sklearn.preprocessing import MinMaxScaler

ms = MinMaxScaler()

X = ms.fit_transform(X)
In [33]:
X = pd.DataFrame(X, columns=[cols])
In [34]:
X.head()
Out[34]:
Sector Hydroelectric Power Geothermal Energy Solar Energy Wind Energy Wood Energy Waste Energy Fuel Ethanol, Excluding Denaturant Biomass Losses and Co-products Biomass Energy Total Renewable Energy Renewable Diesel Fuel Other Biofuels Conventional Hydroelectric Power Biodiesel
0 0.00 0.000976 0.000000 0.0 0.0 0.003104 0.000000 0.0 0.0 0.002444 0.001850 0.0 0.0 0.00000 0.0
1 0.25 0.000976 0.082339 0.0 0.0 0.000294 0.004776 0.0 0.0 0.000905 0.289521 0.0 0.0 0.75368 0.0
2 0.50 0.508541 0.000000 0.0 0.0 0.538769 0.000000 0.0 0.0 0.424241 0.324403 0.0 0.0 0.00000 0.0
3 0.75 0.000976 0.000000 0.0 0.0 0.163777 0.000000 0.0 0.0 0.000000 0.097587 0.0 0.0 0.00000 0.0
4 1.00 0.000976 0.000000 0.0 0.0 0.000000 0.000000 0.0 0.0 0.000000 0.000000 0.0 0.0 0.00000 0.0
In [35]:
from sklearn.cluster import KMeans
In [36]:
from sklearn.cluster import KMeans
cs = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
    kmeans.fit(X)
    cs.append(kmeans.inertia_)
plt.plot(range(1, 11), cs)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('CS')
plt.show()
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_20800\957846521.py in <module>
      3 for i in range(1, 11):
      4     kmeans = KMeans(n_clusters = i, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
----> 5     kmeans.fit(X)
      6     cs.append(kmeans.inertia_)
      7 plt.plot(range(1, 11), cs)

D:\anaconda files\lib\site-packages\sklearn\cluster\_kmeans.py in fit(self, X, y, sample_weight)
   1184 
   1185             # run a k-means once
-> 1186             labels, inertia, centers, n_iter_ = kmeans_single(
   1187                 X,
   1188                 sample_weight,

D:\anaconda files\lib\site-packages\sklearn\cluster\_kmeans.py in _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter, verbose, x_squared_norms, tol, n_threads)
    623     # Threadpoolctl context to limit the number of threads in second level of
    624     # nested parallelism (i.e. BLAS) to avoid oversubsciption.
--> 625     with threadpool_limits(limits=1, user_api="blas"):
    626         for i in range(max_iter):
    627             lloyd_iter(

D:\anaconda files\lib\site-packages\sklearn\utils\fixes.py in threadpool_limits(limits, user_api)
    312         return controller.limit(limits=limits, user_api=user_api)
    313     else:
--> 314         return threadpoolctl.threadpool_limits(limits=limits, user_api=user_api)
    315 
    316 

D:\anaconda files\lib\site-packages\threadpoolctl.py in __init__(self, limits, user_api)
    169             self._check_params(limits, user_api)
    170 
--> 171         self._original_info = self._set_threadpool_limits()
    172 
    173     def __enter__(self):

D:\anaconda files\lib\site-packages\threadpoolctl.py in _set_threadpool_limits(self)
    266             return None
    267 
--> 268         modules = _ThreadpoolInfo(prefixes=self._prefixes,
    269                                   user_api=self._user_api)
    270         for module in modules:

D:\anaconda files\lib\site-packages\threadpoolctl.py in __init__(self, user_api, prefixes, modules)
    338 
    339             self.modules = []
--> 340             self._load_modules()
    341             self._warn_if_incompatible_openmp()
    342         else:

D:\anaconda files\lib\site-packages\threadpoolctl.py in _load_modules(self)
    371             self._find_modules_with_dyld()
    372         elif sys.platform == "win32":
--> 373             self._find_modules_with_enum_process_module_ex()
    374         else:
    375             self._find_modules_with_dl_iterate_phdr()

D:\anaconda files\lib\site-packages\threadpoolctl.py in _find_modules_with_enum_process_module_ex(self)
    483 
    484                 # Store the module if it is supported and selected
--> 485                 self._make_module_from_path(filepath)
    486         finally:
    487             kernel_32.CloseHandle(h_process)

D:\anaconda files\lib\site-packages\threadpoolctl.py in _make_module_from_path(self, filepath)
    513             if prefix in self.prefixes or user_api in self.user_api:
    514                 module_class = globals()[module_class]
--> 515                 module = module_class(filepath, prefix, user_api, internal_api)
    516                 self.modules.append(module)
    517 

D:\anaconda files\lib\site-packages\threadpoolctl.py in __init__(self, filepath, prefix, user_api, internal_api)
    604         self.internal_api = internal_api
    605         self._dynlib = ctypes.CDLL(filepath, mode=_RTLD_NOLOAD)
--> 606         self.version = self.get_version()
    607         self.num_threads = self.get_num_threads()
    608         self._get_extra_info()

D:\anaconda files\lib\site-packages\threadpoolctl.py in get_version(self)
    644                              lambda: None)
    645         get_config.restype = ctypes.c_char_p
--> 646         config = get_config().split()
    647         if config[0] == b"OpenBLAS":
    648             return config[1].decode("utf-8")

AttributeError: 'NoneType' object has no attribute 'split'
In [ ]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters= 2, random_state=0) 

kmeans.fit(X)
In [ ]:
kmeans.cluster_centers_
In [ ]:
kmeans.inertia_
In [ ]:
labels = kmeans.labels_

# check how many of the samples were correctly labeled
correct_labels = sum(y == labels)

print("Result: %d out of %d samples were correctly labeled." % (correct_labels, y.size))
In [ ]:
print('Accuracy score: {0:0.2f}'. format(correct_labels/float(y.size)))
In [ ]:
# k= 3

kmeans = KMeans(n_clusters=3, random_state=0)

kmeans.fit(X)

# check how many of the samples were correctly labeled
labels = kmeans.labels_

correct_labels = sum(y == labels)
print("Result: %d out of %d samples were correctly labeled." % (correct_labels, y.size))
print('Accuracy score: {0:0.2f}'. format(correct_labels/float(y.size)))
In [ ]:
# k= 5

kmeans = KMeans(n_clusters=5, random_state=0)

kmeans.fit(X)

# check how many of the samples were correctly labeled
labels = kmeans.labels_

correct_labels = sum(y == labels)
print("Result: %d out of %d samples were correctly labeled." % (correct_labels, y.size))
print('Accuracy score: {0:0.2f}'. format(correct_labels/float(y.size)))
In [ ]:
 
In [ ]: